Dataset Link: https://www.openml.org/search?type=data&sort=runs&id=31&status=active
I find that openml has a better description of the data. But the file is not in csv or excel format.
You can search online of how to convert the .arff file to .csv file.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hvplot.pandas

pd.set_option('display.float', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
In [32]:
# read the data file

data = pd.read_csv('../Data/UCI_German/german_credit.csv')

# remove single quote in the columns names

data = data.rename(columns=lambda x: x.replace("'", ""))

data.head(10)
Out[32]:
checking_status duration credit_history purpose credit_amount savings_status employment installment_commitment personal_status other_parties residence_since property_magnitude age other_payment_plans housing existing_credits job num_dependents own_telephone foreign_worker class
0 '<0' 6 'critical/other existing credit' radio/tv 1169 'no known savings' '>=7' 4 'male single' none 4 'real estate' 67 none own 2 skilled 1 yes yes good
1 '0<=X<200' 48 'existing paid' radio/tv 5951 '<100' '1<=X<4' 2 'female div/dep/mar' none 2 'real estate' 22 none own 1 skilled 1 none yes bad
2 'no checking' 12 'critical/other existing credit' education 2096 '<100' '4<=X<7' 2 'male single' none 3 'real estate' 49 none own 1 'unskilled resident' 2 none yes good
3 '<0' 42 'existing paid' furniture/equipment 7882 '<100' '4<=X<7' 2 'male single' guarantor 4 'life insurance' 45 none 'for free' 1 skilled 2 none yes good
4 '<0' 24 'delayed previously' 'new car' 4870 '<100' '1<=X<4' 3 'male single' none 4 'no known property' 53 none 'for free' 2 skilled 2 none yes bad
5 'no checking' 36 'existing paid' education 9055 'no known savings' '1<=X<4' 2 'male single' none 4 'no known property' 35 none 'for free' 1 'unskilled resident' 2 yes yes good
6 'no checking' 24 'existing paid' furniture/equipment 2835 '500<=X<1000' '>=7' 3 'male single' none 4 'life insurance' 53 none own 1 skilled 1 none yes good
7 '0<=X<200' 36 'existing paid' 'used car' 6948 '<100' '1<=X<4' 2 'male single' none 2 car 35 none rent 1 'high qualif/self emp/mgmt' 1 yes yes good
8 'no checking' 12 'existing paid' radio/tv 3059 '>=1000' '4<=X<7' 2 'male div/sep' none 4 'real estate' 61 none own 1 'unskilled resident' 1 none yes good
9 '0<=X<200' 30 'critical/other existing credit' 'new car' 5234 '<100' unemployed 4 'male mar/wid' none 2 car 28 none own 2 'high qualif/self emp/mgmt' 1 none yes bad
In [3]:
data.describe()
Out[3]:
duration credit_amount installment_commitment residence_since age existing_credits num_dependents
count 1000.00 1000.00 1000.00 1000.00 1000.00 1000.00 1000.00
mean 20.90 3271.26 2.97 2.85 35.55 1.41 1.16
std 12.06 2822.74 1.12 1.10 11.38 0.58 0.36
min 4.00 250.00 1.00 1.00 19.00 1.00 1.00
25% 12.00 1365.50 2.00 2.00 27.00 1.00 1.00
50% 18.00 2319.50 3.00 3.00 33.00 1.00 1.00
75% 24.00 3972.25 4.00 4.00 42.00 2.00 1.00
max 72.00 18424.00 4.00 4.00 75.00 4.00 2.00
In [4]:
print(data["class"].value_counts())
data["class"].value_counts().hvplot(kind='bar', x='index', y='class', title='Class')
good    700
bad     300
Name: class, dtype: int64
Out[4]:
In [5]:
plt.figure(figsize=(8, 5))
sns.heatmap(data.corr(), annot=True, cmap='viridis')
Out[5]:
<AxesSubplot:>
In [33]:
feature = 'duration'
hist = data.hvplot.hist(y=feature, by='class', bins=20, alpha=0.5, height=250, width=500)
kde = data.hvplot.kde(y=feature, by='class', line_width=3, height=250, width=500)
result = hist + kde
result.cols(1)
Out[33]:
In [7]:
feature = 'credit_amount'
hist = data.hvplot.hist(y=feature, by='class', bins=20, alpha=0.5, height=250, width=500)
kde = data.hvplot.kde(y=feature, by='class', line_width=3, height=250, width=500)
result = hist + kde
result.cols(1)
Out[7]:
In [8]:
feature = 'age'
hist = data.hvplot.hist(y=feature, by='class', bins=20, alpha=0.5, height=250, width=500)
kde = data.hvplot.kde(y=feature, by='class', line_width=3, height=250, width=500)
result = hist + kde
result.cols(1)
Out[8]:
In [ ]:
 
In [9]:
print(data["checking_status"].value_counts())
data["checking_status"].value_counts().hvplot(kind='bar', x='index', y='checking_status', title='Checking Status')
'no checking'    394
'<0'             274
'0<=X<200'       269
'>=200'           63
Name: checking_status, dtype: int64
Out[9]:
In [10]:
print(data['credit_history'].value_counts())
data['credit_history'].value_counts().hvplot(kind='bar', x='index', y='credit_history', title='Credit History')
'existing paid'                     530
'critical/other existing credit'    293
'delayed previously'                 88
'all paid'                           49
'no credits/all paid'                40
Name: credit_history, dtype: int64
Out[10]:
In [11]:
print(data["purpose"].value_counts())
data["purpose"].value_counts().hvplot(kind='bar', x='index', y='purpose', title='Purpose')
radio/tv                280
'new car'               234
furniture/equipment     181
'used car'              103
business                 97
education                50
repairs                  22
'domestic appliance'     12
other                    12
retraining                9
Name: purpose, dtype: int64
Out[11]:
In [12]:
print(data["savings_status"].value_counts())
data["savings_status"].value_counts().hvplot(kind='bar', x='index', y='savings_status', title='Saving Status')
'<100'                603
'no known savings'    183
'100<=X<500'          103
'500<=X<1000'          63
'>=1000'               48
Name: savings_status, dtype: int64
Out[12]:
In [13]:
print(data["employment"].value_counts())
data["employment"].value_counts().hvplot(kind='bar', x='index', y='employment', title='Employment')
'1<=X<4'      339
'>=7'         253
'4<=X<7'      174
'<1'          172
unemployed     62
Name: employment, dtype: int64
Out[13]:
In [14]:
print(data["installment_commitment"].value_counts())
data["installment_commitment"].value_counts().hvplot(kind='bar', x='index', y='installment_commitment', title='Installment Commitment')
4    476
2    231
3    157
1    136
Name: installment_commitment, dtype: int64
Out[14]:
In [15]:
print(data["personal_status"].value_counts())
data["personal_status"].value_counts().hvplot(kind='bar', x='index', y='personal_status', title='Personal Status')
'male single'           548
'female div/dep/mar'    310
'male mar/wid'           92
'male div/sep'           50
Name: personal_status, dtype: int64
Out[15]:
In [16]:
print(data["other_parties"].value_counts())
data["other_parties"].value_counts().hvplot(kind='bar', x='index', y='other_parties', title='Other Parties')
none              907
guarantor          52
'co applicant'     41
Name: other_parties, dtype: int64
Out[16]:
In [17]:
print(data["residence_since"].value_counts())
data["residence_since"].value_counts().hvplot(kind='bar', x='index', y='residence_since', title='Residence Since')
4    413
2    308
3    149
1    130
Name: residence_since, dtype: int64
Out[17]:
In [18]:
print(data["property_magnitude"].value_counts())
data["property_magnitude"].value_counts().hvplot(kind='bar', x='index', y='property_magnitude', title='Property Magnitude')
car                    332
'real estate'          282
'life insurance'       232
'no known property'    154
Name: property_magnitude, dtype: int64
Out[18]:
In [19]:
print(data["other_payment_plans"].value_counts())
data["other_payment_plans"].value_counts().hvplot(kind='bar', x='index', y='other_payment_plans', title='Other Payment Plans')
none      814
bank      139
stores     47
Name: other_payment_plans, dtype: int64
Out[19]:
In [20]:
print(data["housing"].value_counts())
data["housing"].value_counts().hvplot(kind='bar', x='index', y='housing', title='Housing')
own           713
rent          179
'for free'    108
Name: housing, dtype: int64
Out[20]:
In [21]:
print(data["existing_credits"].value_counts())
data["existing_credits"].value_counts().hvplot(kind='bar', x='index', y='existing_credits', title='Existing Credits')
1    633
2    333
3     28
4      6
Name: existing_credits, dtype: int64
Out[21]:
In [22]:
print(data["job"].value_counts())
data["job"].value_counts().hvplot(kind='bar', x='index', y='job', title='Job')
skilled                        630
'unskilled resident'           200
'high qualif/self emp/mgmt'    148
'unemp/unskilled non res'       22
Name: job, dtype: int64
Out[22]:
In [23]:
print(data["num_dependents"].value_counts())
data["num_dependents"].value_counts().hvplot(kind='bar', x='index', y='num_dependents', title='Num Dependents')
1    845
2    155
Name: num_dependents, dtype: int64
Out[23]:
In [24]:
print(data["own_telephone"].value_counts())
data["own_telephone"].value_counts().hvplot(kind='bar', x='index', y='own_telephone', title='Telephone Owner')
none    596
yes     404
Name: own_telephone, dtype: int64
Out[24]:
In [25]:
print(data["foreign_worker"].value_counts())
data["foreign_worker"].value_counts().hvplot(kind='bar', x='index', y='foreign_worker', title='Foreign Worker')
yes    963
no      37
Name: foreign_worker, dtype: int64
Out[25]:

XGB¶

In [26]:
from sklearn.metrics import confusion_matrix, roc_auc_score

def get_experiment_result(y_true, y_pred_proba):
    cm = confusion_matrix(y_true, np.round(y_pred_proba))

    TP = cm[1,1]
    FP = cm[0,1]
    FN = cm[1,0]
    TN = cm[0,0]

    a = TP/(TP+FN)
    b = TN/(TN+FP)
    auc = roc_auc_score(y_true, y_pred_proba)

    print("Non-Default Recall: %.3f%%" % (b))
    print("Default Recall: %.3f%%" % (a))
    print("AUC: %.3f%%" % (auc))
In [34]:
# Mapping the labels
data['class'] = data['class'].map({'good': 0, 'bad': 1})


# One-hot Encoding the categorical features
categorical_feature = ['checking_status', 'credit_history', 'purpose', 'savings_status',
                       'employment', 'personal_status', 'other_parties', 'property_magnitude',
                       'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
data = pd.get_dummies(data, columns=categorical_feature, prefix= categorical_feature)
In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

if "Set" not in data.columns:
    data["Set"] = np.random.choice(["train", "test"], p =[.8, .2], size=(data.shape[0],))

train_indices = data[data.Set=="train"].index
test_indices = data[data.Set=="test"].index
data = data.drop(columns=['Set'])

y_train = data["class"].values[train_indices]
y_test = data["class"].values[test_indices]
data = data.drop("class", axis=1)

X_train = data.values[train_indices]
X_test = data.values[test_indices]
In [36]:
print("Training Instances: ", X_train.shape[0])
print("Test Instances: ", X_test.shape[0])
np.bincount(y_train)
Training Instances:  804
Test Instances:  196
Out[36]:
array([568, 236])

Balancing the data¶

In [37]:
from imblearn.under_sampling import RandomUnderSampler

# Resample using random undersampling
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)

# NORMALIZE & CONVERT TO NUMPY ARRAYS
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [38]:
print("Training Instances: ", X_train.shape[0])
print("Test Instances: ", X_test.shape[0])
np.bincount(y_train)
Training Instances:  472
Test Instances:  196
Out[38]:
array([236, 236])
In [39]:
from xgboost import XGBClassifier

# Create an XGBoost classifier object
xgb_clf = XGBClassifier()

# Train the XGBoost classifier on the training data
xgb_clf.fit(X_train, y_train)

y_pred_proba = xgb_clf.predict_proba(X_test)

get_experiment_result(y_test, y_pred_proba[:, 1])
Non-Default Recall: 0.720%
Default Recall: 0.656%
AUC: 0.756%

Normal Training¶

In [30]:
# NORMALIZE & CONVERT TO NUMPY ARRAYS
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [31]:
from xgboost import XGBClassifier

# Create an XGBoost classifier object
xgb_clf = XGBClassifier()

# Train the XGBoost classifier on the training data
xgb_clf.fit(X_train, y_train)

y_pred_proba = xgb_clf.predict_proba(X_test)

get_experiment_result(y_test, y_pred_proba[:, 1])
Non-Default Recall: 0.802%
Default Recall: 0.591%
AUC: 0.772%
In [ ]: